################################################################################
# 01_clean_ets_data.R
# Clean ETS/Praxis Data and Create Test Difficulty Index (TDI)
#
# Inputs:
#   - data/raw/ets/cleaned ppst and core and composite.dta
#   - data/raw/ets/states_we_add_back_in.xlsx
#
# Outputs:
#   - data/cleaned/ets_treatment_data.xlsx
################################################################################

library(tidyverse)
library(readxl)
library(writexl)
library(haven)

# ──────────────────────────────────────────────────────────────────────────────
# 1. Load and Clean ETS Data
# ──────────────────────────────────────────────────────────────────────────────

# Load additional states (ND, OR, TN)
additional_states <- read_excel("data/raw/ets/states_we_add_back_in.xlsx") %>%
  mutate(z_score = (passingscore - test_mean) / test_sd) %>%
  mutate(ID = as.character(ID))

# Load main ETS data
ets_cleaned <- read_stata("data/raw/ets/cleaned ppst and core and composite.dta") %>%
  select(-eftotlt) %>%
  bind_rows(additional_states) %>%
  # Fix PA data issues
  mutate(passingscore = if_else(State == "PA" & year >= 2016 & subject == "math", 150, passingscore)) %>%
  mutate(passingscore = if_else(State == "PA" & year >= 2017 & subject == "math", 142, passingscore)) %>%
  mutate(passingscore_composite = if_else(State == "PA" & year >= 2016 & subject == "math", 150, passingscore_composite)) %>%
  mutate(passingscore_composite = if_else(State == "PA" & year >= 2017 & subject == "math", 142, passingscore_composite))

# ──────────────────────────────────────────────────────────────────────────────
# 2. Handle State-Specific Issues (AR, CT, DE 2018 values)
# ──────────────────────────────────────────────────────────────────────────────

# Get 2016 values for states that need filling
ar_2016_scores <- ets_cleaned %>% filter(State == "AR" & year == 2016) %>% select(test, passingscore, passingscore_composite)
ct_2016_scores <- ets_cleaned %>% filter(State == "CT" & year == 2016) %>% select(test, passingscore, passingscore_composite)
de_2016_scores <- ets_cleaned %>% filter(State == "DE" & year == 2016) %>% select(test, passingscore, passingscore_composite)

# Replace 2018 and 2020 values
ets_cleaned <- ets_cleaned %>%
  mutate(
    passingscore = case_when(
      State == "AR" & year %in% c(2018, 2020) ~ ar_2016_scores$passingscore[match(test, ar_2016_scores$test)],
      State == "CT" & year %in% c(2018, 2020) ~ ct_2016_scores$passingscore[match(test, ct_2016_scores$test)],
      State == "DE" & year %in% c(2018, 2020) ~ de_2016_scores$passingscore[match(test, de_2016_scores$test)],
      TRUE ~ passingscore
    ),
    passingscore_composite = case_when(
      State == "AR" & year %in% c(2018, 2020) ~ ar_2016_scores$passingscore_composite[match(test, ar_2016_scores$test)],
      State == "CT" & year %in% c(2018, 2020) ~ ct_2016_scores$passingscore_composite[match(test, ct_2016_scores$test)],
      State == "DE" & year %in% c(2018, 2020) ~ de_2016_scores$passingscore_composite[match(test, de_2016_scores$test)],
      TRUE ~ passingscore_composite
    )
  ) %>%
  mutate(z_score = (passingscore - test_mean) / test_sd)

# ──────────────────────────────────────────────────────────────────────────────
# 3. Expand to Include Odd Years
# ──────────────────────────────────────────────────────────────────────────────

# Create odd year data by copying from adjacent even years
ets_2009 <- ets_cleaned %>% filter(year == 2008) %>% select(-year) %>% mutate(year = 2009)
ets_2011 <- ets_cleaned %>% filter(year == 2010) %>% select(-year) %>% mutate(year = 2011)
ets_2013 <- ets_cleaned %>% filter(year == 2010) %>% select(-year) %>% mutate(year = 2013)
ets_2015 <- ets_cleaned %>% filter(year == 2016) %>% select(-year) %>% mutate(year = 2015)
ets_2017 <- ets_cleaned %>% filter(year == 2016) %>% select(-year) %>% mutate(year = 2017) %>%
  mutate(passingscore = if_else(State == "PA" & year >= 2017 & subject == "math", 142, passingscore)) %>%
  mutate(passingscore_composite = if_else(State == "PA" & year >= 2017 & subject == "math", 142, passingscore_composite))
ets_2019 <- ets_cleaned %>% filter(year == 2018) %>% select(-year) %>% mutate(year = 2019)

# Combine all years
ets_expanded <- ets_cleaned %>%
  bind_rows(ets_2009, ets_2011, ets_2013, ets_2015, ets_2017, ets_2019) %>%
  arrange(State, year, subject, time) %>%
  # Fix SC 2019 issue
  mutate(passingscore_composite = if_else(State == "SC" & year == 2019, 158, passingscore_composite))

# ──────────────────────────────────────────────────────────────────────────────
# 4. Create Test Difficulty Index (TDI)
# ──────────────────────────────────────────────────────────────────────────────

ets_final <- ets_expanded %>%
  mutate(z_score_composite = ((passingscore_composite - test_mean) / test_sd)) %>%
  group_by(State, year) %>%
  mutate(
    z_score_total = sum(z_score, na.rm = TRUE),
    test_index = z_score_total / 3,  # Average of reading, writing, math z-scores
    z_score_composite_total = sum(z_score_composite, na.rm = TRUE),
    test_index_composite = z_score_composite_total / 3
  ) %>%
  ungroup()

# ──────────────────────────────────────────────────────────────────────────────
# 5. Create Treatment Variables
# ──────────────────────────────────────────────────────────────────────────────

treatment_data <- ets_final %>%
  filter(subject == "math") %>%  # Use one row per state-year
  select(-c(ID, passingscore, test, subject, test_name, test_mean, test_sd, z_score,
            passingscore_composite, z_score_composite, z_score_composite_total, z_score_total)) %>%
  group_by(State) %>%
  arrange(State, year) %>%
  # Create lead variables for treatment calculation
  mutate(
    test_index_lead1 = lag(test_index, 1),
    test_index_composite_lead1 = lag(test_index_composite, 1)
  ) %>%
  ungroup() %>%
  # Treatment = change from 2012 to 2014 (pre to post Praxis Core)
  mutate(
    test_index_lead1 = if_else(year != 2014, NA_real_, test_index_lead1),
    test_index_composite_lead1 = if_else(year != 2014, NA_real_, test_index_composite_lead1)
  ) %>%
  mutate(
    treat = test_index - test_index_lead1,
    composite_treat = test_index_composite - test_index_composite_lead1
  ) %>%
  group_by(State) %>%
  arrange(State, year) %>%
  fill(treat, composite_treat, .direction = "downup") %>%
  rename(
    continuous_treatment_amount = treat,
    continuous_composite_treatment_amount = composite_treat
  ) %>%
  fill(continuous_treatment_amount, continuous_composite_treatment_amount, .direction = "downup") %>%
  ungroup() %>%
  arrange(State, year) %>%
  # Treatment year = 2013 (transition year)
  mutate(
    treatment_year = 2013,
    time_till = year - treatment_year
  )

# ──────────────────────────────────────────────────────────────────────────────
# 6. Create Lead/Lag Variables for Event Study
# ──────────────────────────────────────────────────────────────────────────────

for (i in -5:7) {
  treatment_data <- treatment_data %>%
    mutate(
      !!paste0(if_else(i < 0, "lead", "lag"), abs(i)) := if_else(time_till == i, continuous_treatment_amount, NA_real_),
      !!paste0(if_else(i < 0, "composite_lead", "composite_lag"), abs(i)) := if_else(time_till == i, continuous_composite_treatment_amount, NA_real_)
    )
}

# Rename for clarity
treatment_data <- treatment_data %>%
  rename(
    continuous_treat = continuous_treatment_amount,
    continuous_composite_treat = continuous_composite_treatment_amount
  )

# Arrange columns
treatment_data <- treatment_data %>%
  select(
    State, year, treatment_year, time_till, test_index, test_index_lead1, continuous_treat,
    starts_with("lead"), starts_with("lag"),
    test_index_composite, test_index_composite_lead1, continuous_composite_treat,
    starts_with("composite_lead"), starts_with("composite_lag")
  )

# ──────────────────────────────────────────────────────────────────────────────
# 7. Save Output
# ──────────────────────────────────────────────────────────────────────────────

write_xlsx(treatment_data, "data/cleaned/ets_treatment_data.xlsx")

cat("  Created: data/cleaned/ets_treatment_data.xlsx\n")
cat("  States:", n_distinct(treatment_data$State), "\n")
cat("  Years:", min(treatment_data$year), "-", max(treatment_data$year), "\n")
cat("  Average delta TDI:", round(mean(treatment_data$continuous_treat, na.rm = TRUE), 3), "\n")
